# -*- coding: utf-8 -*-
"""
Created on 2018-09-13 09:19:13
Last Modified on 2018-09-13 09:19:13

Get sequence from a set of bed format region.

@Author: Ying Huang
"""
import argparse
from collections import deque
import os
import pandas as pd


def main():
    args = cmd()
    check_path(args)
    get_seq(args.idir, args.ibed, args.ofile)

    
def check_path(args):
    # Check input directory
    if not os.path.exists(args.idir):
        raise Exception(
            "'{}' not exist, please use 'splitFA' generate it.".format(args.idir))
    # Check ofile
    if os.path.exists(args.ofile):
        raise Exception(
            "'{}' already exist, please give a new ofile path.".format(args.ofile))
    # Check ibed
    if not os.path.exists(args.ibed) or not args.ibed.endswith('.bed'):
        raise Exception("Please input a exist file with suffix '.bed'.")
    

def cmd():
    
    msg = """
    Get sequence from a set of bed format region.
    """

    parser = argparse.ArgumentParser(description=msg)

    parser.add_argument(
        '-idir',
        dest='idir',
        action='store',
        help="Input directory stored separated FASTA file that generated by script 'splitFA'."
    )

    parser.add_argument(
        '-ibed',
        dest='ibed',
        action='store',
        help="Input a BED foramt file that contains required interval information."
    )

    parser.add_argument(
        '-o',
        dest='ofile',
        action='store',
        help="FASTA file to store output sequence."
    )

    args = parser.parse_args()

    return args


def get_seq(idir_fa, ibed, ofile):

    tmp_data = deque()
    
    bed = read_bed(ibed)

    chroms = set(bed.chr)

    for chrom in chroms:

        # Read separated FASTA file that generated by 'splitFA'.
        ifile_fa = os.path.join(idir_fa, chrom + '.fa')
        print("Reading FASTA file from '{}'...".format(ifile_fa))
        seq = read_fa(ifile_fa)

        for ch, start, end, name in bed.loc[(bed['chr'] == chrom)].values:

            # Search sequence by BED file information.
            content = search_seq(start, end, seq)
            # Make header of searched seq to write out.
            header = ">{}:{}-{}:0-base {} +".format(ch, str(start), str(end), name)
            # Store searched sequence and its header in tmp variable
            tmp_data.append(header + '\n' + content + '\n')

            # Write out when tmp variable stored 1000 genes sequence
            # (header and content)
            if len(tmp_data) >= 1000:
                print("Output 1000 genes sequence...")
                with open(ofile, 'a') as f:
                    f.write(''.join(tmp_data))
                tmp_data.clear()

    print("Output final genes sequence.")
    if len(tmp_data) > 0:
            with open(ofile, 'a') as f:
                f.write(''.join(tmp_data))
            tmp_data.clear()


def read_fa(ipath):

    fasta = deque()

    with open(ipath, 'r') as f:
        for line in f:
            if not line.startswith('>'):
                fasta.append(line.strip('\n'))

    return ''.join(fasta)


def read_bed(ipath):

    ## Read BED file by Pandas.
    # BED file should be sorted by "Chromsome, Start site, End site".
    # Only columns of "Chromsome, Start site, End site, Name" are needed.
    bed = pd.read_csv(ipath, sep='\t', header=None).sort_values([0, 1, 2])[[0, 1, 2, 3]]

    bed.columns = ['chr', 'start', 'end', 'name']

    # set each column dtype
    bed['chr'] = bed['chr'].astype(str)
    bed['start'] = bed['start'].astype(int)
    bed['end'] = bed['end'].astype(int)
    bed['name'] = bed['name'].astype(str)

    return bed


def search_seq(start, end, seq):
    """
    start: start site of seq, 0-base.
    end: end site of seq, 0-base.
    seq: DNA sequence of a chromsome.
    """

    assert isinstance(seq, str)
    assert isinstance(start, int) & isinstance(end, int)

    return seq[start: end + 1]
        

if __name__ == '__main__':
    main()
